/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is Hadoop_BasicSinglePassIndexer.java.
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
* Craig Macdonald <craigm{a.}dcs.gla.ac.uk>
* Richard McCreadie <richardm{a.}dcs.gla.ac.uk>
*/
package org.terrier.indexing.hadoop;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TaskAttemptID;
import org.terrier.compression.BitIn;
import org.terrier.compression.BitOutputStream;
import org.terrier.indexing.BasicSinglePassIndexer;
import org.terrier.indexing.Document;
import org.terrier.structures.BasicLexiconEntry;
import org.terrier.structures.DocumentIndexEntry;
import org.terrier.structures.FSOMapFileLexiconOutputStream;
import org.terrier.structures.FieldDocumentIndexEntry;
import org.terrier.structures.FieldLexiconEntry;
import org.terrier.structures.Index;
import org.terrier.structures.IndexUtil;
import org.terrier.structures.LexiconOutputStream;
import org.terrier.structures.SimpleDocumentIndexEntry;
import org.terrier.structures.indexing.CompressingMetaIndexBuilder;
import org.terrier.structures.indexing.DocumentIndexBuilder;
import org.terrier.structures.indexing.MetaIndexBuilder;
import org.terrier.structures.indexing.singlepass.FieldPostingInRun;
import org.terrier.structures.indexing.singlepass.RunsMerger;
import org.terrier.structures.indexing.singlepass.SimplePostingInRun;
import org.terrier.structures.indexing.singlepass.hadoop.HadoopRunIteratorFactory;
import org.terrier.structures.indexing.singlepass.hadoop.HadoopRunWriter;
import org.terrier.structures.indexing.singlepass.hadoop.HadoopRunsMerger;
import org.terrier.structures.indexing.singlepass.hadoop.IDComparator;
import org.terrier.structures.indexing.singlepass.hadoop.MapData;
import org.terrier.structures.indexing.singlepass.hadoop.MapEmittedPostingList;
import org.terrier.structures.indexing.singlepass.hadoop.SplitAwareWrapper;
import org.terrier.structures.indexing.singlepass.hadoop.SplitEmittedTerm;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;
import org.terrier.utility.FieldScore;
import org.terrier.utility.Files;
import org.terrier.utility.io.HadoopPlugin;
import org.terrier.utility.io.HadoopUtility;
import org.terrier.utility.io.WrappedIOException;
import org.terrier.utility.io.HadoopPlugin.JobFactory;
/**
* Single Pass MapReduce indexer.
* <p><h3>Map phase processing</h3>
* Indexes as a Map task, taking in a series of documents and emitting posting lists for terms as
* memory becomes exhausted. Two side-files are created for each map task: the first (the run file) records how many documents were indexed
* in each flush of each map; the second contains the statistics for each document in a miniature document index.
* </p>
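* <p>A configuration sketch: the memory threshold that triggers a map-side flush is read from the
* <tt>indexing.singlepass.max.postings.memory</tt> property (as in {@link #configureMap()} below);
* the value shown here is purely illustrative.</p>
* <pre>{@code
* // sketch only: bound the postings memory held before a flush is forced
* ApplicationSetup.setProperty("indexing.singlepass.max.postings.memory", "100000000");
* }</pre>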
* <p><h3>Reduce phase processing</h3>
* All posting lists for each term are read in, one term at a time. Using the run files, the posting lists are output into the final inverted
* file, with all document ids corrected. Lastly, when all terms have been processed, the document indexes are merged into the final document
* index, and the lexicon hash and lexid created.
* </p>
* <p><h3>Partitioned Reduce processing</h3>
* Normally, the MapReduce indexer is used with a single reducer. However, if the partitioner is used, multiple reducers can run concurrently,
* building several final indices. In doing so, a large collection can be indexed into several output indices, which may be useful for distributed
* retrieval.
* </p>
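* <p>A sketch of enabling the partitioned case on the job configuration (the property name is the
* one read in {@link #configureReduce()}; the number of reduce tasks is illustrative):</p>
* <pre>{@code
* JobConf conf = new JobConf();
* conf.setNumReduceTasks(4);                                  // build four output indices
* conf.setBoolean("indexing.hadoop.multiple.indices", true);  // one index per reducer
* }</pre>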
* @author Richard McCreadie and Craig Macdonald
* @since 2.2
*/
@SuppressWarnings("deprecation")
public class Hadoop_BasicSinglePassIndexer
extends BasicSinglePassIndexer
implements Mapper<Text, SplitAwareWrapper<Document>, SplitEmittedTerm, MapEmittedPostingList>,
Reducer<SplitEmittedTerm, MapEmittedPostingList, Object, Object>
{
/**
* Entry point. With the arguments <tt>--finish numberOfReduceTasks</tt>, builds the reverse
* metadata structures for the index (or indices) produced by a previous MapReduce indexing job;
* otherwise prints usage information.
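* <p>A minimal sketch of the finishing invocation (the reducer count is illustrative):</p>
* <pre>{@code
* // equivalent to passing "--finish 4" on the command line
* Hadoop_BasicSinglePassIndexer.main(new String[]{"--finish", "4"});
* }</pre>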
* @param args command-line arguments
* @throws Exception if a JobFactory cannot be obtained from the HadoopPlugin
*/
public static void main(String[] args) throws Exception
{
if (args.length == 2 && args[0].equals("--finish"))
{
final JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");
if (jf == null)
throw new Exception("Could not get JobFactory from HadoopPlugin");
try{
finish(ApplicationSetup.TERRIER_INDEX_PATH, Integer.parseInt(args[1]), jf);
} catch (Exception e) {
logger.error("Couldn't finish index", e);
} finally {
jf.close();
}
}
else
{
System.err.println("Usage: Hadoop_BasicSinglePassIndexer [--finish numberOfReduceTasks]");
}
}
/**
* Finishes the index (or indices) produced by a previous MapReduce indexing job, by building
* the reverse metadata lookup structures as further MapReduce job(s) via
* CompressingMetaIndexBuilder.reverseAsMapReduceJob(). When several reduce tasks were used,
* one job is run per output index, in parallel threads.
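* <p>A sketch of calling this method directly, mirroring what main() does
* (the index path and reducer count are hypothetical):</p>
* <pre>{@code
* JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");
* try {
*     Hadoop_BasicSinglePassIndexer.finish("/path/to/index", 2, jf);
* } finally {
*     jf.close();
* }
* }</pre>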
* @param destinationIndexPath path of the index (or indices) to finish
* @param numberOfReduceTasks the number of reduce tasks used by the indexing job
* @param jf the JobFactory used to create the reverse metadata MapReduce job(s)
* @throws Exception if the index cannot be loaded
*/
public static void finish(final String destinationIndexPath, int numberOfReduceTasks, final JobFactory jf) throws Exception
{
final String[] reverseMetaKeys = ApplicationSetup.getProperty("indexer.meta.reverse.keys", "docno").split("\\s*,\\s*");
Index.setIndexLoadingProfileAsRetrieval(false);
if (numberOfReduceTasks == 1)
{
Index index = Index.createIndex(destinationIndexPath, ApplicationSetup.TERRIER_INDEX_PREFIX);
if (index == null)
{
throw new IOException("No such index ["+destinationIndexPath+","+ApplicationSetup.TERRIER_INDEX_PREFIX+"]");
}
CompressingMetaIndexBuilder.reverseAsMapReduceJob(index, "meta", reverseMetaKeys, jf);
index.close();
return;
}
//make a list of MR jobs in separate threads
List<Thread> threads = new ArrayList<Thread>(numberOfReduceTasks);
for(int i=0;i<numberOfReduceTasks;i++)
{
final int id = i;
threads.add(new Thread() {
@Override
public void run() {
try{
Index index = Index.createIndex(destinationIndexPath, ApplicationSetup.TERRIER_INDEX_PREFIX+"-"+id);
CompressingMetaIndexBuilder.reverseAsMapReduceJob(index, "meta", reverseMetaKeys, jf);
index.close();
} catch (Exception e) {
logger.error("Problem finishing meta", e);
e.printStackTrace();
}
}
});
}
//start the threads
for(Thread t : threads)
t.start();
//wait for the threads to end
for(Thread t : threads)
t.join();
}
static enum Counters {
INDEXED_DOCUMENTS, INDEXED_EMPTY_DOCUMENTS, INDEXER_FLUSHES, INDEXED_TOKENS, INDEXED_POINTERS;
};
/** JobConf of the current running job */
protected JobConf jc;
/** The split that these documents came from **/
protected int splitnum;
protected boolean start;
/**
* Empty constructor.
*/
public Hadoop_BasicSinglePassIndexer() {
super(0,0,0);
numberOfDocuments = currentId = numberOfDocsSinceCheck = numberOfDocsSinceFlush = numberOfUniqueTerms = 0;
numberOfTokens = numberOfPointers = 0;
flushNo=0;
flushList = new LinkedList<Integer>();
}
/** Configure this indexer. Firstly, loads ApplicationSetup appropriately.
* Actual configuration of indexer is then handled by configureMap() or configureReduce()
* depending on whether a Map or Reduce task is being configured.
* @param _jc The configuration for the job
*/
public void configure(JobConf _jc)
{
this.jc = _jc;
//1. configure application
try{
HadoopUtility.loadTerrierJob(_jc);
} catch (Exception e) {
throw new Error("Cannot load ApplicationSetup", e);
}
//2. configure indexer
try{
if (HadoopUtility.isMap(_jc))
{
configureMap();
} else {
configureReduce();
}
} catch (Exception e) {
throw new Error("Cannot configure indexer", e);
}
}
/** Called when the Map or Reduce task ends, to finish up the indexer. Actual cleanup is
* handled by closeMap() or closeReduce() depending on whether this is a Map or Reduce task.
*/
public void close() throws IOException
{
if (HadoopUtility.isMap(jc))
{
closeMap();
} else {
closeReduce();
}
}
/** The Hadoop indexer has no concept of boundary documents. */
@Override
protected void load_builder_boundary_documents() { }
/* ==============================================================
* Map implementation from here down
* ==============================================================
*/
/** output collector for the current map indexing process */
protected OutputCollector<SplitEmittedTerm, MapEmittedPostingList> outputPostingListCollector;
/** Current map number */
protected String mapTaskID;
/** How many flushes have we made */
protected int flushNo;
/** OutputStream for the data on the runs (runNo, flushes etc) */
protected DataOutputStream RunData;
/** List of how many documents are in each flush we have made */
protected LinkedList<Integer> flushList;
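/* A note on the .runs side-file layout, as written by configureMap(), forceFlush() and closeMap()
 * below: a UTF-encoded map task id, then one int per flush giving the number of documents indexed
 * in that flush, then the sentinel -1, the total number of documents for this map, and finally the
 * split number. */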
protected void configureMap() throws Exception
{
super.init();
Path indexDestination = FileOutputFormat.getWorkOutputPath(jc);
Files.mkdir(indexDestination.toString());
mapTaskID = TaskAttemptID.forName(jc.get("mapred.task.id")).getTaskID().toString();
currentIndex = Index.createNewIndex(indexDestination.toString(), mapTaskID);
maxMemory = Long.parseLong(ApplicationSetup.getProperty("indexing.singlepass.max.postings.memory", "0"));
//during the reduce phase, we don't want these map-side indices loaded into memory,
//as we only use them as streams
currentIndex.setIndexProperty("index.preloadIndices.disabled", "true");
RunData = new DataOutputStream(
Files.writeFileStream(
new Path(indexDestination, mapTaskID+".runs").toString())
);
RunData.writeUTF(mapTaskID);
start = true;
createMemoryPostings();
super.emptyDocIndexEntry = new SimpleDocumentIndexEntry();
super.docIndexBuilder = new DocumentIndexBuilder(currentIndex, "document");
super.metaBuilder = createMetaIndexBuilder();
emptyDocIndexEntry = (FieldScore.FIELDS_COUNT > 0) ? new FieldDocumentIndexEntry(FieldScore.FIELDS_COUNT) : new SimpleDocumentIndexEntry();
}
protected MetaIndexBuilder createMetaIndexBuilder()
{
final String[] forwardMetaKeys = ApplicationSetup.getProperty("indexer.meta.forward.keys", "docno").split("\\s*,\\s*");
final int[] metaKeyLengths = parseInts(ApplicationSetup.getProperty("indexer.meta.forward.keylens", "20").split("\\s*,\\s*"));
//no reverse metadata during main indexing, pick up as separate job later
return new CompressingMetaIndexBuilder(currentIndex, forwardMetaKeys, metaKeyLengths, new String[0]);
}
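/* For illustration only: the forward metadata configuration read above could be supplied in the
 * Terrier configuration as, e.g. (key names as used in createMetaIndexBuilder(); values hypothetical):
 *   indexer.meta.forward.keys=docno,url
 *   indexer.meta.forward.keylens=20,256
 * Reverse metadata lookups are deliberately deferred to the separate --finish job (see finish()). */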
/** Causes the posting lists built up in memory to be flushed out */
@edu.umd.cs.findbugs.annotations.SuppressWarnings(
value="DM_GC",
justification="Forcing GC is an essential part of releasing " +
"memory for further indexing")
protected void forceFlush() throws IOException
{
//logger.info("Map "+mapTaskID+", flush requested, containing "+numberOfDocsSinceFlush+" documents, flush "+flushNo);
if (mp == null)
throw new IOException("Map flushed before any documents were indexed");
mp.finish(new HadoopRunWriter(outputPostingListCollector, mapTaskID, splitnum, flushNo));
RunData.writeInt(currentId);
if (currentReporter != null)
currentReporter.incrCounter(Counters.INDEXER_FLUSHES, 1);
System.gc();
createMemoryPostings();
memoryCheck.reset();
numberOfDocsSinceFlush = 0;
currentId = 0;
flushNo++;
}
/**
* Map processes a single document. Stores the terms in the document along with the posting lists
* until memory is full or all documents in this map have been processed, then writes them to
* the output collector.
* @param key - Wrapper for Document Number
* @param value - Wrapper for Document Object
* @param _outputPostingListCollector Collector for emitting terms and postings lists
* @throws IOException
*/
public void map(
Text key, SplitAwareWrapper<Document> value,
OutputCollector<SplitEmittedTerm, MapEmittedPostingList> _outputPostingListCollector,
Reporter reporter)
throws IOException
{
final String docno = key.toString();
currentReporter = reporter;
reporter.setStatus("Currently indexing "+docno);
final Document doc = value.getObject();
if (start) {
splitnum = value.getSplitIndex();
System.out.println(splitnum);
//RunData.writeInt(splitnum);
start = false;
}
this.outputPostingListCollector = _outputPostingListCollector;
/* setup for parsing */
createDocumentPostings();
String term;//term we're currently processing
numOfTokensInDocument = 0;
//numberOfDocuments++;
//get each term in the document
while (!doc.endOfDocument()) {
reporter.progress();
if ((term = doc.getNextTerm())!=null && !term.equals("")) {
termFields = doc.getFields();
/* pass term into TermPipeline (stop, stem etc) */
pipeline_first.processTerm(term);
/* the term pipeline will eventually add the term to this object. */
}
if (MAX_TOKENS_IN_DOCUMENT > 0 &&
numOfTokensInDocument > MAX_TOKENS_IN_DOCUMENT)
break;
}
//if we didn't index all tokens from the document,
//we still need to get to the end of the document.
while (!doc.endOfDocument()){
doc.getNextTerm();
}
/* we now have all terms in the DocumentTree, so we save the document tree */
if (termsInDocument.getDocumentLength() == 0)
{ /* this document is empty, add the minimum to the document index */
// Nothing in the ifile
indexEmpty(doc.getAllProperties());
}
else
{ /* index this document */
try{
indexDocument(doc.getAllProperties(), termsInDocument);
numberOfTokens += numOfTokensInDocument;
reporter.incrCounter(Counters.INDEXED_TOKENS, numOfTokensInDocument);
reporter.incrCounter(Counters.INDEXED_POINTERS, termsInDocument.getNumberOfPointers());
} catch (IOException ioe) {
throw ioe;
} catch (Exception e) {
throw new WrappedIOException(e);
}
}
termsInDocument.clear();
reporter.incrCounter(Counters.INDEXED_DOCUMENTS, 1);
}
protected Reporter currentReporter;
/**
* Adds an empty document to the document and metadata indices (nothing is written to the inverted index)
*/
protected void indexEmpty(final Map<String,String> docProperties) throws IOException
{
/* add doc to documentindex, even though it's empty */
if(IndexEmptyDocuments)
{
//logger.warn("Adding empty document "+docProperties.get("docno"));
docIndexBuilder.addEntryToBuffer(emptyDocIndexEntry);
metaBuilder.writeDocumentEntry(docProperties);
currentId++;
numberOfDocuments++;
currentReporter.incrCounter(Counters.INDEXED_EMPTY_DOCUMENTS, 1);
}
}
/** Finish up the map processing. Forces a flush, then writes out the final run data */
protected void closeMap() throws IOException
{
forceFlush();
docIndexBuilder.finishedCollections();
currentIndex.setIndexProperty("index.inverted.fields.count", ""+FieldScore.FIELDS_COUNT);
if (FieldScore.FIELDS_COUNT > 0)
{
currentIndex.addIndexStructure("document-factory", FieldDocumentIndexEntry.Factory.class.getName(), "java.lang.String", "${index.inverted.fields.count}");
}
else
{
currentIndex.addIndexStructure("document-factory", SimpleDocumentIndexEntry.Factory.class.getName(), "", "");
}
metaBuilder.close();
currentIndex.flush();
currentIndex.close();
RunData.writeInt(-1);
RunData.writeInt(numberOfDocuments);
RunData.writeInt(splitnum);
RunData.close();
//logger.info("Map "+mapTaskID+ " finishing, indexed "+numberOfDocuments+ " in "+(flushNo-1)+" flushes");
}
/* ==============================================================
* Reduce implementation from here down
* ==============================================================
*/
/** OutputStream for the Lexicon*/
protected LexiconOutputStream<String> lexstream;
/** runIterator factory being used to generate RunIterators */
protected HadoopRunIteratorFactory runIteratorF = null;
/** records whether the reduce() has been called for the first time */
protected boolean reduceStarted = false;
protected boolean mutipleIndices = true;
protected int reduceId;
protected String[] MapIndexPrefixes = null;
protected Reporter lastReporter = null;
protected void configureReduce() throws Exception
{
super.init();
start = true;
//load in the current index
final Path indexDestination = FileOutputFormat.getWorkOutputPath(jc);
Files.mkdir(path = indexDestination.toString());
final String indexDestinationPrefix = jc.get("indexing.hadoop.prefix", "data");
reduceId = TaskAttemptID.forName(jc.get("mapred.task.id")).getTaskID().getId();
mutipleIndices = jc.getBoolean("indexing.hadoop.multiple.indices", true);
if (jc.getNumReduceTasks() > 1)
{
//gets the reduce number and appends it as a suffix to the index prefix
prefix = indexDestinationPrefix + "-"+reduceId;
}
else
{
prefix = indexDestinationPrefix;
}
currentIndex = Index.createNewIndex(path, prefix);
super.merger = createtheRunMerger();
reduceStarted = false;
}
protected LinkedList<MapData> loadRunData() throws IOException
{
// Load in Run Data
ArrayList<String> mapTaskIDs = new ArrayList<String>();
final LinkedList<MapData> runData = new LinkedList<MapData>();
DataInputStream runDataIn;
final String jobId = TaskAttemptID.forName(jc.get("mapred.task.id")).getJobID().toString().replaceAll("job", "task");
final FileStatus[] files = FileSystem.get(jc).listStatus(
FileOutputFormat.getOutputPath(jc),
new org.apache.hadoop.fs.PathFilter()
{
public boolean accept(Path path)
{
final String name = path.getName();
//1. is this a run file
if (!( name.startsWith( jobId ) && name.endsWith(".runs")))
return false;
return true;
}
}
);
if (files == null || files.length == 0)
{
throw new IOException("No run status files found in "+FileOutputFormat.getOutputPath(jc));
}
final int thisPartition = TaskAttemptID.forName(jc.get("mapred.task.id")).getTaskID().getId();
final SplitEmittedTerm.SETPartitioner partitionChecker = new SplitEmittedTerm.SETPartitioner();
partitionChecker.configure(jc);
MapData tempHRD;
for (FileStatus file : files)
{
//logger.info("Run data file "+ file.getPath().toString()+" has length "+Files.length(file.getPath().toString()));
runDataIn = new DataInputStream(Files.openFileStream(file.getPath().toString()));
tempHRD = new MapData(runDataIn);
//check to see if this file contained our split information
if (mutipleIndices && partitionChecker.calculatePartition(tempHRD.getSplitnum(), jc.getNumReduceTasks()) != thisPartition)
{
//this run belongs to another reducer's partition; release the stream and skip it
runDataIn.close();
continue;
}
mapTaskIDs.add(tempHRD.getMap());
runData.add(tempHRD);
runDataIn.close();
}
// Sort by splitnum
Collections.sort(runData);
Collections.sort(mapTaskIDs, new IDComparator(runData));
// A list of the index shards
MapIndexPrefixes = mapTaskIDs.toArray(new String[0]);
return runData;
}
/**
* Begins the reduce: initialises the runs merger and the lexicon output stream. The merger
* converts the document IDs in the postings to be relative to one another, using the run number,
* the number of documents covered in each run, the flush number for that run and the number of
* documents flushed.
* @param mapData information about the runs (maps) and the flushes
*/
public void startReduce(LinkedList<MapData> mapData) throws IOException
{
//logger.info("The number of Reduce Tasks being used : "+jc.getNumReduceTasks());
((HadoopRunsMerger)(super.merger)).beginMerge(mapData);
this.currentIndex.setIndexProperty("max.term.length", ApplicationSetup.getProperty("max.term.length", ""+20));
lexstream = new FSOMapFileLexiconOutputStream(this.currentIndex, "lexicon",
(FieldScore.FIELDS_COUNT > 0 ? FieldLexiconEntry.Factory.class : BasicLexiconEntry.Factory.class));
// Tell the merger how many reducers to merge for
((HadoopRunsMerger) merger).setNumReducers(
mutipleIndices ? jc.getNumReduceTasks() : 1);
}
/** Main reduce algorithm step. Called for every term in the merged index, together with accessors
* to the posting list information that has been written.
* This reduce has no output.
* @param Term the indexing term whose posting lists are being merged
* @param postingIterator Iterator over the temporary posting lists we have for this term
* @param output Unused output collector
* @param reporter Used to report progress
*/
public void reduce(
SplitEmittedTerm Term,
Iterator<MapEmittedPostingList> postingIterator,
OutputCollector<Object, Object> output,
Reporter reporter)
throws IOException
{
//if (logger.isDebugEnabled()) logger.debug("Reduce for term "+Term.getText());
reporter.setStatus("Reducer is merging term " + Term.getTerm());
if (! reduceStarted)
{
final LinkedList<MapData> runData = loadRunData();
startReduce(runData);
reduceStarted = true;
}
String term = Term.getTerm().trim();
if (term.length() == 0)
return;
runIteratorF.setRunPostingIterator(postingIterator);
runIteratorF.setTerm(term);
try{
merger.mergeOne(lexstream);
} catch (Exception e) {
throw new WrappedIOException(e);
}
reporter.progress();
this.lastReporter = reporter;
}
/** Merges the simple document indexes made for each map into the final document index */
@SuppressWarnings("unchecked")
protected void mergeDocumentIndex(Index[] src) throws IOException
{
//logger.info("Merging document and meta indices");
final DocumentIndexBuilder docidOutput = new DocumentIndexBuilder(currentIndex, "document");
final MetaIndexBuilder metaBuilder = this.createMetaIndexBuilder();
int i_index = 0;
int docCount =-1;
for (Index srcIndex: src)
{
final Iterator<DocumentIndexEntry> docidInput = (Iterator<DocumentIndexEntry>)srcIndex.getIndexStructureInputStream("document");
final Iterator<String[]> metaInput1 = (Iterator<String[]>)srcIndex.getIndexStructureInputStream("meta");
while (docidInput.hasNext())
{
docCount++;
docidOutput.addEntryToBuffer(docidInput.next());
metaBuilder.writeDocumentEntry(metaInput1.next());
this.lastReporter.progress();
}
IndexUtil.close(docidInput);
IndexUtil.close(metaInput1);
i_index++;
}
metaBuilder.close();
docidOutput.finishedCollections();
if (FieldScore.FIELDS_COUNT > 0)
{
currentIndex.addIndexStructure("document-factory", FieldDocumentIndexEntry.Factory.class.getName(), "java.lang.String", "${index.inverted.fields.count}");
}
else
{
currentIndex.addIndexStructure("document-factory", SimpleDocumentIndexEntry.Factory.class.getName(), "", "");
}
//logger.info("Finished merging document indices from "+src.length+" map tasks: "+docCount +" documents found");
}
/** Finishes the reduce step by closing the lexicon and inverted file outputs,
* building the lexicon hash and index, and merging the document indices created
* by the map tasks. The output index is then finalised. */
protected void closeReduce() throws IOException {
if (! reduceStarted)
{
//logger.warn("No terms were input, skipping reduce close");
return;
}
//generate final index structures
//1. any remaining lexicon terms
merger.endMerge(lexstream);
//2. the end of the inverted file
merger.getBos().close();
lexstream.close();
//register the inverted index structure and its input stream with the index
currentIndex.addIndexStructure(
"inverted",
invertedIndexClass,
"org.terrier.structures.Index,java.lang.String,org.terrier.structures.DocumentIndex,java.lang.Class",
"index,structureName,document,"+
(FieldScore.FIELDS_COUNT > 0
? fieldInvertedIndexPostingIteratorClass
: basicInvertedIndexPostingIteratorClass ));
currentIndex.addIndexStructureInputStream(
"inverted",
invertedIndexInputStreamClass,
"org.terrier.structures.Index,java.lang.String,java.util.Iterator,java.lang.Class",
"index,structureName,lexicon-entry-inputstream,"+
(FieldScore.FIELDS_COUNT > 0
? fieldInvertedIndexPostingIteratorClass
: basicInvertedIndexPostingIteratorClass ));
currentIndex.setIndexProperty("index.inverted.fields.count", ""+FieldScore.FIELDS_COUNT );
currentIndex.setIndexProperty("index.inverted.fields.names", ArrayUtils.join(FieldScore.FIELD_NAMES, ","));
//3. finalise the lexicon
currentIndex.setIndexProperty("num.Terms",""+ lexstream.getNumberOfTermsWritten() );
currentIndex.setIndexProperty("num.Tokens",""+lexstream.getNumberOfTokensWritten() );
currentIndex.setIndexProperty("num.Pointers",""+lexstream.getNumberOfPointersWritten() );
if (FieldScore.FIELDS_COUNT > 0)
currentIndex.addIndexStructure("lexicon-valuefactory", FieldLexiconEntry.Factory.class.getName(), "java.lang.String", "${index.inverted.fields.count}");
this.finishedInvertedIndexBuild();
//the document indices are only merged if we are creating multiple indices
//OR if this is the first reducer for a job creating a single index
if (mutipleIndices || reduceId == 0)
{
//4. document index
Index[] sourceIndices = new Index[MapIndexPrefixes.length];
for (int i= 0; i<MapIndexPrefixes.length;i++)
{
sourceIndices[i] = Index.createIndex(FileOutputFormat.getOutputPath(jc).toString(), MapIndexPrefixes[i]);
if (sourceIndices[i] == null)
throw new IOException("Could not load index from ("
+FileOutputFormat.getOutputPath(jc).toString()+","+ MapIndexPrefixes[i] +") because "
+Index.getLastIndexLoadError());
}
this.mergeDocumentIndex(sourceIndices);
//5. close the map phase indices
for(Index i : sourceIndices)
{
i.close();
}
}
currentIndex.flush();
}
/** Creates the RunsMerger and the RunIteratorFactory */
protected RunsMerger createtheRunMerger() {
//logger.info("creating run merged with fields="+useFieldInformation);
runIteratorF =
new HadoopRunIteratorFactory(null,
(useFieldInformation
? FieldPostingInRun.class
: SimplePostingInRun.class),
super.numFields);
HadoopRunsMerger tempRM = new HadoopRunsMerger(runIteratorF);
try{
tempRM.setBos(new BitOutputStream(
currentIndex.getPath() + ApplicationSetup.FILE_SEPARATOR
+ currentIndex.getPrefix() + ".inverted" + BitIn.USUAL_EXTENSION));
} catch (IOException ioe) {
ioe.printStackTrace();
}
return (RunsMerger)tempRM;
}
}